# -*- coding: utf-8 -*-
# @Time     : 2022/6/20 12:20
# @Author   : JustFly
# @File     : pc_httpxMultithreading_parsel.py
# @Software : PyCharm

"""|~||||||||||真（IO密集操作） 王炸 ！！！！|||||||||~|


httpx多线程 + parsel 以爬取链家二手房信息为例

# 1.获取总页数
# 2.每一页都分配给一个 线程 进行爬取和数据解析
    线程共享内存，可直接通讯


最大页码数: 34
线程开始爬取：https://sh.lianjia.com/ershoufang/pudong/pg1p5/...
.......
线程开始爬取：https://sh.lianjia.com/ershoufang/pudong/pg34p5/...
Write a CSV file to path 浦东_三房_500_800万(httpxMultithreading+parsel).csv Successful.
耗时：1.9866933822631836 s

进程已结束，退出代码为 0


?为什么这么快
本例这种IO密集型操作，消耗的时间大多是等待（而等待时CPU是不需要工作的）
而GIL线程全局解释器锁，一遇到等待就会立即释放GIL供新的线程使用（无意中实现了线程的切换哈哈哈...） ！

相反多进程时由于都在等待IO操作，分配过多的CPU核心也没有用
"""


import csv
import json
import re
import threading
import time

import httpx
from fake_useragent import UserAgent
from parsel import Selector


class HomeLinkSpider(object):
    """Scrape Lianjia (链家) second-hand housing listings for Pudong.

    One worker thread fetches and parses each result page; parsed
    records accumulate in ``self.data`` and are finally dumped to CSV.
    Threads append to the shared list directly — ``list.append`` is
    atomic under the CPython GIL, so no explicit lock is needed here.
    """

    def __init__(self):
        # Parsed listing dicts, filled concurrently by worker threads.
        self.data = list()
        # Output CSV path (UTF-8 with BOM so Excel opens it correctly).
        self.path = "浦东_三房_500_800万(httpxMultithreading+parsel).csv"
        # Entry URL: Pudong district, price filter p5 (500-800w band).
        self.url = "https://sh.lianjia.com/ershoufang/pudong/p5/"

    def get_max_page(self):
        """Fetch the entry page and return the total page count.

        Returns:
            int: total number of result pages, or None on failure.
        """
        res = httpx.get(self.url, headers={"User-Agent": UserAgent().random})
        if res.status_code != 200:
            print(f'请求失败 status: {res.status_code}')
            return None
        selector = Selector(res.text)
        # The pager div carries its state as JSON in the page-data
        # attribute, e.g. {"totalPage":34,"curPage":1}.
        page_box = selector.css('div[class="page-box house-lst-page-box"]')
        if not page_box:
            return None  # page layout changed or anti-bot page served
        page_data = page_box[0].xpath('//@page-data').get()
        # json.loads, NOT eval(): page-data is untrusted remote content
        # and is plain JSON — eval would execute arbitrary expressions.
        max_page = json.loads(page_data)['totalPage']
        print(f'最大页码数: {max_page}')
        return max_page

    def parse_single_page(self, url):
        """Fetch one result page and append one dict per listing to self.data.

        Runs inside a worker thread; failures skip the offending card or
        page instead of killing the whole thread.
        """
        print(f"线程开始爬取：{url}...")
        res = httpx.get(url, headers={"User-Agent": UserAgent().random})
        if res.status_code != 200:
            return  # best effort: a failed page should not crash the thread

        selector = Selector(res.text)
        ul_list = selector.css('ul.sellListContent')
        if not ul_list:
            return  # empty/blocked page — nothing to parse

        # Compile patterns once per page, not once per listing.
        floor_pattern = re.compile(r'\d{1,2}')
        year_pattern = re.compile(r'\d{4}')
        price_pattern = re.compile(r'\d+')

        for li in ul_list[0].css('li'):
            detail = dict()
            detail['title'] = li.css('div.title a::text').get()

            # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
            house_info = li.css('div.houseInfo::text').get()
            house_info_list = house_info.split(" | ") if house_info else []
            if len(house_info_list) < 6:
                continue  # malformed card (e.g. an ad <li>) — skip it
            # Layout, area, orientation.
            detail['bedroom'] = house_info_list[0]
            detail['area'] = house_info_list[1]
            detail['direction'] = house_info_list[2]
            # Floor: first 1-2 digit number in e.g. "高楼层(共6层)".
            match1 = floor_pattern.search(house_info_list[4])
            detail['floor'] = match1.group() if match1 else "未知"
            # Build year: 4-digit number in e.g. "1999年建".
            match2 = year_pattern.search(house_info_list[5])
            detail['year'] = match2.group() if match2 else "未知"

            # "文兰小区 - 塘桥": community name and sub-district.
            position_info = li.css('div.positionInfo a::text').getall()
            detail['house'] = position_info[0] if position_info else None
            detail['location'] = position_info[1] if len(position_info) > 1 else None

            # Total price, e.g. "650" (万), kept as displayed.
            detail['total_price'] = li.css('div.totalPrice span::text').get()

            # Unit price, e.g. "64,518元/平米" -> "64518".
            unit_price = li.css('div.unitPrice span::text').get() or ""
            price_match = price_pattern.search(unit_price.replace(",", ""))
            detail['unit_price'] = price_match.group() if price_match else "未知"

            # Atomic under the GIL; safe without a lock.
            self.data.append(detail)

    def parse_page(self):
        """Spawn one thread per result page and wait for all to finish."""
        max_page = self.get_max_page()
        if not max_page:
            return  # entry request failed — nothing to crawl

        thread_list = []
        # One thread per page; all are I/O-bound so the GIL is released
        # while each waits on the network.
        for i in range(1, max_page + 1):
            url = f"https://sh.lianjia.com/ershoufang/pudong/pg{i}p5/"
            t = threading.Thread(target=self.parse_single_page, args=(url, ))
            thread_list.append(t)

        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()

    def write_csv_file(self):
        """Write self.data to self.path as an Excel-friendly CSV.

        Uses utf_8_sig (UTF-8 with BOM) so Excel detects the encoding.
        """
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction", "floor", "year", "location", "total_price",
                "unit_price"]

        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                for item in self.data:
                    # .get() so one record missing a key cannot abort
                    # the whole export.
                    writer.writerow([item.get(k) for k in keys])
                print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


if __name__ == '__main__':
    # perf_counter is the correct clock for elapsed-time measurement:
    # it is monotonic and high-resolution, whereas time.time() is a
    # wall clock that can jump (NTP adjustments, DST).
    start = time.perf_counter()

    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()

    end = time.perf_counter()
    print(f"耗时：{end - start} s")
























